Lab 1 Assignment 2

library(readr)
library(data.table)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(shiny)

1 Reading input

data_senic <- read_table2("SENIC.txt", col_names = FALSE)
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   X2 = col_double(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_double(),
##   X6 = col_double(),
##   X7 = col_integer(),
##   X8 = col_integer(),
##   X9 = col_integer(),
##   X10 = col_integer(),
##   X11 = col_integer(),
##   X12 = col_double()
## )
setnames(data_senic, old = c("X1", "X2", "X3", "X4", "X5", "X6", "X7", "X8", "X9", "X10", "X11", "X12"), 
                     new = c("ID", "length_of_stay", "avg_age", "avg_risk", "culturing_ratio", "chest_xray_ratio",
                           "avg_beds", "school_affil", "region", "avg_daily_census", "avg_nurses", "facility_per"))

# Fixing the datatypes

data_senic$school_affil <- factor(data_senic$school_affil, levels = c(1, 2),labels = c("Yes", "No"))
data_senic$region <- factor(data_senic$region, levels = c(1, 2, 3, 4),labels = c("NE", "NC", "S", "W"))

2. Function to calculate the oultiers

quant_cal <- function(x){
  #x <-  mtcars$hp
  q1 <- as.numeric(quantile(x, probs = c(0.25)))
  q3 <- as.numeric(quantile(x, probs = c(0.75)))
  iqr <- q3-q1
  upper_value <- q3+(1.5*iqr)
  lower_value <- q1-(1.5*iqr)
  outliers_index <- which(x < lower_value | x > upper_value)
  #which(x %in% boxplot(x)$out) # must be equal to above
  return(outliers_index)
}

3. Plotting infection risk vs. outlier

ggplot(data=data_senic, aes(x=avg_risk)) + geom_density() + geom_point(data = data_senic[quant_cal(data_senic$avg_risk), c("avg_risk")], aes(y = 0, x = avg_risk), shape=18, size=2) 

Analysis: We see that the number of outliers are fairly small, however this visualization is not very insightful

4. Plotting for all numeric columns

data_senic_numeric <- data_senic[,c("ID", "length_of_stay", "avg_age", "avg_risk", "culturing_ratio", "chest_xray_ratio", "avg_beds", "avg_daily_census", "avg_nurses", "facility_per")]

p1 <- ggplot(data=data_senic_numeric, aes(x=length_of_stay)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$length_of_stay), c("length_of_stay")], aes(y = 0, x = length_of_stay), shape=18, size=2) 
p2 <- ggplot(data=data_senic_numeric, aes(x=avg_age)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$avg_age), c("avg_age")], aes(y = 0, x = avg_age), shape=18, size=2) 
p3 <- ggplot(data=data_senic_numeric, aes(x=culturing_ratio)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$culturing_ratio), c("culturing_ratio")], aes(y = 0, x = culturing_ratio), shape=18, size=2) 
p4 <- ggplot(data=data_senic_numeric, aes(x=chest_xray_ratio)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$chest_xray_ratio), c("chest_xray_ratio")], aes(y = 0, x = chest_xray_ratio), shape=18, size=2) 
p5 <- ggplot(data=data_senic_numeric, aes(x=avg_beds)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$avg_beds), c("avg_beds")], aes(y = 0, x = avg_beds), shape=18, size=2) 
p6 <- ggplot(data=data_senic_numeric, aes(x=avg_daily_census)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$avg_daily_census), c("avg_daily_census")], aes(y = 0, x = avg_daily_census), shape=18, size=2) 
p7 <- ggplot(data=data_senic_numeric, aes(x=avg_nurses)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$avg_nurses), c("avg_nurses")], aes(y = 0, x = avg_nurses), shape=18, size=2) 
p8 <- ggplot(data=data_senic_numeric, aes(x=facility_per)) + geom_density() # contains no outliers
p9 <- ggplot(data=data_senic_numeric, aes(x=avg_risk)) + geom_density() + geom_point(data = data_senic_numeric[quant_cal(data_senic_numeric$avg_risk), c("avg_risk")], aes(y = 0, x = avg_risk), shape=18, size=2) 

# final output
grid.arrange(p1, p2, p3, p4, p5, p6, p7, p8, p9) # facility_per does not contain any outlier

Analysis: Most parameters are skewed towards right, while only age and risk are close to normally distributed

5. Plot of infection risk vs. number of beds

ggplot(data = data_senic, aes(x = avg_nurses, y = avg_risk)) + geom_point(aes(color = avg_beds))

Analysis: We see trend between number of nurses and risk, as the number of nurses increases risk decrease but this only happens when the average nurse >200, this may be due to the fact that medical facilities with higher risks are higher in number than low risk hospital. The risk of this colour profile is its hard to distinguish between the levels of beds.

6. Converting from ggplot2 to plotly

ggplotly(p9)

Analysis: The ability to pan and zoom are now present.

7. Pipelining the plotly function

data = data_senic_numeric[quant_cal(data_senic_numeric$avg_risk), c("avg_risk")]

data_senic %>% select(avg_risk) %>% plot_ly(x = ~avg_risk, type = "histogram") %>% 
  add_markers(x = data$avg_risk, y = 0, showlegend = FALSE, marker = list(color = "red", symbol = c('diamond')))

8. Shiny app

library(shiny)

ui <- fluidPage(

    # Application title
    titlePanel("Select the variable to plot"),
    sidebarPanel(
            column(3,checkboxGroupInput(label = "variable", "Select Column to Plot",
                                        choices = c("length_of_stay", "avg_age", "avg_risk", "culturing_ratio",                                                          "chest_xray_ratio", "avg_beds", "avg_daily_census", "avg_nurses", "facility_per"),
                                        selected = "length_of_stay"))),
    
    sliderInput("bw_adjust", label = "Bandwidth adjustment:", min = 0.2, max = 2, value = 1, step = 0.2),
            mainPanel(plotOutput('coolplot'),width = '40%'))


server <- shinyServer(function(input, output) {

    plotOutput$coolplot<-renderPlot({
            gg <- ifelse(checkboxGroupInput$variable == "length_of_stay", p1,
                         ifelse(checkboxGroupInput$variable == "avg_age", p2,
                                ifelse(checkboxGroupInput$variable == "culturing_ratio", p3,
                                       ifelse(checkboxGroupInput$variable == "chest_xray_ratio", p4,
                                              ifelse(checkboxGroupInput$variable == "avg_beds", p5,
                                                     ifelse(checkboxGroupInput$variable == "avg_daily_census", p6,
                                                            ifelse(checkboxGroupInput$variable == "avg_nurses", p7,
                                                                   ifelse(checkboxGroupInput$variable == "facility_per", p8, p9))))))))
                                                                                            
            plot(gg)
    })

})

shinyApp(ui, server)
Shiny applications not supported in static R Markdown documents